Retailer Analysis¶
Here through the analysis we'll get to know:
- How many transactions per retailer?
- Which retailers bring the highest revenue?
- Comparing Operating Margin across retailers
- Who sells the most units vs who generates the most profit?
- Which retailer dominates in which region/state.
- Product Mix by Retailer: Are some retailers focused on footwear vs apparel, etc.?
- Sales Method by Retailer: Compare if certain retailers lean online/offline
In [2]:
import pandas as pd
df = pd.read_csv(r"C:\Users\lakshita rawat\Downloads\Project_2\Dataset_1.csv")
df.columns
Out[2]:
Index(['Retailer', 'Months', 'Year', 'Invoice Date', 'Region', 'State', 'City', 'Product', 'Price per Unit($)', 'Units Sold', 'Total Sales($)', 'Operating Profit($)', 'Operating Margin', 'Sales Method'], dtype='object')
In [ ]:
import matplotlib.pyplot as plt
# Count number of invoices per retailer (proxy for transactions)
transactions = df.groupby('Retailer')['Invoice Date'].nunique().sort_values(ascending=False)
ax = transactions.plot(kind='bar', figsize=(10,6), color="skyblue")
for i, v in enumerate(transactions):
ax.text(i, v, f'{int(v)}', ha='center', va='bottom')
# Visualization
transactions.plot(kind='bar', figsize=(10,5), color='skyblue')
plt.title("Number of Transactions per Retailer")
plt.ylabel("Transactions")
plt.show()
# Highest revenue by the retailer
revenue = df.groupby('Retailer')['Total Sales($)'].sum().sort_values(ascending=False)
ax = revenue.plot(kind='bar', figsize=(10,6), color="skyblue")
for i, v in enumerate(revenue):
ax.text(i, v, f'{int(v)}', ha='center', va='bottom')
# Visualization
revenue.plot(kind='bar', figsize=(10,5), color='lightgreen')
plt.title("Total Revenue per Retailer")
plt.ylabel("Revenue ($)")
plt.show()
# Operating Margin across the Retailers
margin = df.groupby('Retailer')['Operating Margin'].mean().sort_values(ascending=False)
print(margin)
# Visualization
margin.plot(kind='bar', figsize=(10,5), color='orange')
plt.title("Average Operating Margin per Retailer")
plt.ylabel("Operating Margin (%)")
plt.show()
# Units Sold VS Profit Generated
import seaborn as sns
units_vs_profit = df.groupby('Retailer').agg({
'Units Sold':'sum',
'Operating Profit($)':'sum'
}).sort_values(by='Operating Profit($)', ascending=False)
print(units_vs_profit)
# Visualization: Units vs Profit
plt.figure(figsize=(8,6))
sns.scatterplot(
x='Units Sold',
y='Operating Profit($)',
data=units_vs_profit.reset_index(),
hue='Retailer', s=120
)
plt.title("Units Sold vs Operating Profit per Retailer")
plt.xlabel("Units Sold")
plt.ylabel("Operating Profit ($)")
plt.show()
## Sales contribution per retailer by region
region_retailer = df.groupby(['Region','Retailer'])['Total Sales($)'].sum().reset_index()
print(region_retailer)
plt.figure(figsize=(12,6))
sns.barplot(x='Region', y='Total Sales($)', hue='Retailer', data=region_retailer)
plt.title("Retailer Sales Contribution by Region")
plt.ylabel("Total Sales ($)")
plt.xlabel("Region")
plt.legend(title="Retailer")
plt.show()
# Sales split by product category per retailer
product_mix = df.groupby(['Retailer','Product'])['Total Sales($)'].sum().reset_index()
print(product_mix)
plt.figure(figsize=(12,6))
sns.barplot(x='Retailer', y='Total Sales($)', hue='Product', data=product_mix)
plt.title("Product Mix by Retailer")
plt.ylabel("Total Sales ($)")
plt.xlabel("Retailer")
plt.legend(title="Product")
plt.show()
# Retailer-wise sales method comparison
sales_method = df.groupby(['Retailer','Sales Method'])['Total Sales($)'].sum().reset_index()
print(sales_method)
plt.figure(figsize=(12,6))
sns.barplot(x='Retailer', y='Total Sales($)', hue='Sales Method', data=sales_method)
plt.title("Sales Method Contribution by Retailer")
plt.ylabel("Total Sales ($)")
plt.xlabel("Retailer")
plt.legend(title="Sales Method")
plt.show()
Retailer Sports Direct 0.444892 Kohl's 0.419291 Foot Locker 0.417933 Amazon 0.417871 West Gear 0.417856 Walmart 0.406534 Name: Operating Margin, dtype: float64
Units Sold Operating Profit($) Retailer West Gear 625262 85667968 Foot Locker 604369 80722234 Sports Direct 557640 74333022 Kohl's 287375 36811302 Amazon 197990 28818533 Walmart 206225 25782063
Region Retailer Total Sales($) 0 Midwest Amazon 16835873 1 Midwest Foot Locker 47987394 2 Midwest Kohl's 22229415 3 Midwest Sports Direct 26207191 4 Midwest West Gear 22540586 5 Northeast Amazon 36262590 6 Northeast Foot Locker 65326474 7 Northeast Kohl's 14031168 8 Northeast Sports Direct 24698097 9 Northeast Walmart 13712005 10 Northeast West Gear 32293733 11 South Amazon 409091 12 South Foot Locker 9307025 13 South Kohl's 3552055 14 South Sports Direct 65258121 15 South Walmart 33049858 16 South West Gear 33087031 17 Southeast Amazon 10826333 18 Southeast Foot Locker 59669118 19 Southeast Sports Direct 54178543 20 Southeast Walmart 21005539 21 Southeast West Gear 17491703 22 West Amazon 13365025 23 West Foot Locker 37804709 24 West Kohl's 62302115 25 West Sports Direct 12129045 26 West Walmart 6791008 27 West West Gear 137551280
Retailer Product Total Sales($) 0 Amazon Men's Apparel 10474770 1 Amazon Men's Athletic Footwear 12011959 2 Amazon Men's Street Footwear 22161652 3 Amazon Women's Apparel 15710639 4 Amazon Women's Athletic Footwear 7935255 5 Amazon Women's Street Footwear 9404637 6 Foot Locker Men's Apparel 29508995 7 Foot Locker Men's Athletic Footwear 36480415 8 Foot Locker Men's Street Footwear 57481575 9 Foot Locker Women's Apparel 43296114 10 Foot Locker Women's Athletic Footwear 24239624 11 Foot Locker Women's Street Footwear 29087997 12 Kohl's Men's Apparel 14164965 13 Kohl's Men's Athletic Footwear 17885316 14 Kohl's Men's Street Footwear 22421073 15 Kohl's Women's Apparel 20315239 16 Kohl's Women's Athletic Footwear 12493869 17 Kohl's Women's Street Footwear 14834291 18 Sports Direct Men's Apparel 24825286 19 Sports Direct Men's Athletic Footwear 30889734 20 Sports Direct Men's Street Footwear 38672270 21 Sports Direct Women's Apparel 37113257 22 Sports Direct Women's Athletic Footwear 23145288 23 Sports Direct Women's Street Footwear 27825162 24 Walmart Men's Apparel 11490680 25 Walmart Men's Athletic Footwear 13600404 26 Walmart Men's Street Footwear 14924255 27 Walmart Women's Apparel 15316099 28 Walmart Women's Athletic Footwear 8762792 29 Walmart Women's Street Footwear 10464180 30 West Gear Men's Apparel 33263936 31 West Gear Men's Athletic Footwear 42805852 32 West Gear Men's Street Footwear 53165419 33 West Gear Women's Apparel 47287512 34 West Gear Women's Athletic Footwear 30055068 35 West Gear Women's Street Footwear 36386546
Retailer Sales Method Total Sales($) 0 Amazon In-store 22366250 1 Amazon Online 28909731 2 Amazon Outlet 26422931 3 Foot Locker In-store 76525000 4 Foot Locker Online 72943290 5 Foot Locker Outlet 70626430 6 Kohl's In-store 29566250 7 Kohl's Online 30992229 8 Kohl's Outlet 41556274 9 Sports Direct In-store 55048500 10 Sports Direct Online 59225997 11 Sports Direct Outlet 68196500 12 Walmart In-store 16411250 13 Walmart Online 15069494 14 Walmart Outlet 43077666 15 West Gear In-store 156726500 16 West Gear Online 40532141 17 West Gear Outlet 45705692
Product Analysis¶
Here through the analysis we'll get to know:
- Which product categories generate the most revenue?
- Which products are most profitable (high Operating Profit($))?
- Which products are loss-making (negative profit)?
- Which products have the highest Operating Margin?
- Which products show the highest Average Order Value (AOV)? (Total Sales($)/Transactions)
- Seasonality in product sales: Which products peak in certain months (sportswear could link to events)?
- Which sales method is mostly used by which product?
- Top products by region/state
In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Revenue by Product
revenue_by_product = df.groupby('Product')['Total Sales($)'].sum().sort_values(ascending=False)
plt.figure(figsize=(10,5))
ax = revenue_by_product.plot(kind='bar', color='skyblue')
plt.title("Revenue Contribution by Product")
plt.ylabel("Total Sales ($)")
plt.xticks(rotation=45, ha='right')
# Annotation
for i, v in enumerate(revenue_by_product):
ax.text(i, v, f'{int(v):,}', ha='center', va='bottom', fontsize=9)
plt.tight_layout()
plt.show()
# 2. Profit by Product
profit_by_product = df.groupby('Product')['Operating Profit($)'].sum().sort_values(ascending=False)
plt.figure(figsize=(10,5))
ax = profit_by_product.plot(kind='bar', color='lightgreen')
plt.title("Profit Contribution by Product")
plt.ylabel("Operating Profit ($)")
plt.xticks(rotation=45, ha='right')
# Annotation
for i, v in enumerate(profit_by_product):
ax.text(i, v, f'{int(v):,}', ha='center', va='bottom', fontsize=9)
plt.tight_layout()
plt.show()
# 3. Units Sold vs State (Heatmap)
top_states = df.groupby('State')['Units Sold'].sum().nlargest(10).index
filtered_df = df[df['State'].isin(top_states)]
heatmap_data = filtered_df.pivot_table(
values='Units Sold',
index='State',
columns='Product',
aggfunc='sum',
fill_value=0
)
plt.figure(figsize=(12,6))
sns.heatmap(heatmap_data, cmap="YlGnBu", annot=True, fmt="d", cbar_kws={'label': 'Units Sold'})
plt.title("Units Sold by Product in Top 10 States")
plt.xlabel("Product")
plt.ylabel("State")
plt.tight_layout()
plt.show()
print()
# 4. Loss-Making Products
df['Invoice Date'] = pd.to_datetime(df['Invoice Date'], errors='coerce')
loss_products = df.groupby('Product')['Operating Profit($)'].sum()
loss_products = loss_products[loss_products < 0].sort_values()
if loss_products.empty:
print("No loss-making products found.")
else:
plt.figure(figsize=(10,6))
ax = loss_products.plot(kind='bar', color='red')
plt.title("Loss-Making Products by Operating Profit")
plt.ylabel("Operating Profit ($)")
plt.xlabel("Product")
plt.xticks(rotation=45, ha='right')
# Annotation
for i, v in enumerate(loss_products):
ax.text(i, v, f'{int(v):,}', ha='center', va='top' if v < 0 else 'bottom', fontsize=9)
plt.tight_layout()
plt.show()
print()
print()
# 5. Highest Operating Margin
margin_by_product = df.groupby('Product')['Operating Margin'].mean().sort_values(ascending=False)
# Print values before plotting
print("Average Operating Margin by Product:")
print(margin_by_product)
plt.figure(figsize=(10,5))
margin_by_product.plot(kind='bar', color='orange')
plt.title("Average Operating Margin by Product")
plt.ylabel("Operating Margin (%)")
plt.xticks(rotation=45, ha='right')
plt.show()
# 6. Average Order Value (AOV) by Product
aov_by_product = df.groupby('Product')['Total Sales($)'].sum() / df.groupby('Product')['Invoice Date'].nunique()
aov_by_product = aov_by_product.sort_values(ascending=False)
# Print values before plotting
print("\nAverage Order Value (AOV) by Product:")
print(aov_by_product)
plt.figure(figsize=(10,5))
aov_by_product.plot(kind='bar', color='purple')
plt.title("Average Order Value (AOV) by Product")
plt.ylabel("AOV ($)")
plt.xticks(rotation=45, ha='right')
plt.show()
# 7. Seasonality in Product Sales
monthly_product_sales = df.groupby(['Months','Product'])['Total Sales($)'].sum().reset_index()
plt.figure(figsize=(12,6))
sns.lineplot(x='Months', y='Total Sales($)', hue='Product', data=monthly_product_sales, marker='o')
plt.title("Monthly Sales Trend by Product")
plt.ylabel("Total Sales ($)")
plt.xlabel("Month")
plt.show()
# 8. Sales method and the Product
# Group by Product and Sales Method → sum of Units Sold
product_sales_method = df.groupby(['Product', 'Sales Method'])['Units Sold'].sum().unstack(fill_value=0)
# Plot stacked bar chart
ax = product_sales_method.plot(
kind='bar',
stacked=True,
figsize=(12,6),
colormap="tab20"
)
plt.title("Sales Method Distribution by Product")
plt.xlabel("Product")
plt.ylabel("Units Sold")
plt.xticks(rotation=45, ha='right')
plt.legend(title="Sales Method")
plt.tight_layout()
# --- Add annotations on each stacked bar section ---
for container in ax.containers:
ax.bar_label(container, label_type="center", fontsize=8, color="black")
plt.show()
# 9. Top Products by Region
region_product_sales = df.groupby(['Region','Product'])['Total Sales($)'].sum().reset_index()
plt.figure(figsize=(12,6))
sns.barplot(x='Region', y='Total Sales($)', hue='Product', data=region_product_sales)
plt.title("Top Products by Region")
plt.ylabel("Total Sales ($)")
plt.xlabel("Region")
plt.show()
No loss-making products found. Average Operating Margin by Product: Product Men's Street Footwear 0.446130 Women's Apparel 0.441318 Women's Athletic Footwear 0.424359 Men's Apparel 0.413225 Women's Street Footwear 0.410199 Men's Athletic Footwear 0.402702 Name: Operating Margin, dtype: float64
Average Order Value (AOV) by Product: Product Men's Street Footwear 1.373857e+06 Women's Apparel 1.147685e+06 Men's Athletic Footwear 9.850877e+05 Women's Street Footwear 8.258246e+05 Men's Apparel 7.880805e+05 Women's Athletic Footwear 6.924149e+05 dtype: float64
Profitabiity Analysis¶
Here through the analysis we'll get to know:
- Which retailer has the highest profit contribution?
- Which regions are profitable vs unprofitable?
- Which products have low sales but high margins (hidden gems)?
- Which retailers have high sales but low margins (risky volume play)?
- Which Sales Method is making more profit?
- Which State is making more profit?
In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Which retailer has the highest profit contribution?
profit_by_retailer = df.groupby('Retailer')['Operating Profit($)'].sum().sort_values(ascending=False)
plt.figure(figsize=(10,5))
ax = profit_by_retailer.plot(kind='bar', color="lightgreen")
plt.title("Profit Contribution by Retailer")
plt.ylabel("Operating Profit ($)")
plt.xticks(rotation=45, ha='right')
# Annotate values
for i, v in enumerate(profit_by_retailer):
ax.text(i, v, f'{int(v):,}', ha='center', va='bottom', fontsize=9)
ax.grid(False) # removes grid from left axis
plt.show()
# 2. Which regions are profitable vs unprofitable?
region_profit = df.groupby('Region')['Operating Profit($)'].sum().sort_values(ascending=False)
plt.figure(figsize=(8,5))
ax = region_profit.plot(kind='bar', color="skyblue")
plt.title("Profitability by Region")
plt.ylabel("Operating Profit ($)")
plt.xticks(rotation=45, ha='right')
# Annotate values
for i, v in enumerate(region_profit):
ax.text(i, v, f'{int(v):,}', ha='center', va='bottom', fontsize=9)
ax.grid(False) # removes grid from left axis
plt.show()
# 3. Hidden gems: Products with Low Sales but High Margins
product_summary = df.groupby('Product').agg({
'Total Sales($)': 'sum',
'Operating Margin': 'mean'
}).reset_index()
# Use median as threshold
sales_median = product_summary['Total Sales($)'].median()
margin_median = product_summary['Operating Margin'].median()
hidden_gems = product_summary[
(product_summary['Total Sales($)'] < sales_median) &
(product_summary['Operating Margin'] > margin_median)
]
print("Hidden Gem Products (Low Sales, High Margin):")
print(hidden_gems)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Product Summary
product_summary = df.groupby('Product').agg({
'Total Sales($)': 'sum',
'Operating Margin': 'mean'
}).reset_index()
product_summary['Margin_Display'] = product_summary['Operating Margin'].round(2)
x = np.arange(len(product_summary['Product']))
width = 0.4
fig, ax1 = plt.subplots(figsize=(12,6))
# --- Left axis: Total Sales ---
bars1 = ax1.bar(x - width/2, product_summary['Total Sales($)'], width,
label='Total Sales ($)', color='skyblue')
ax1.set_ylabel("Total Sales ($)", color="blue")
ax1.tick_params(axis="y", labelcolor="blue")
# --- Right axis: Operating Margin ---
ax2 = ax1.twinx()
bars2 = ax2.bar(x + width/2, product_summary['Operating Margin'], width,
label='Operating Margin (%)', color='orange')
ax2.set_ylabel("Operating Margin (%)", color="orange")
ax2.tick_params(axis="y", labelcolor="orange")
# --- Annotations for Sales ---
for bar in bars1:
height = bar.get_height()
ax1.text(bar.get_x() + bar.get_width()/2, height + 0.02*height,
f'{int(height):,}', ha='center', va='bottom', fontsize=9)
# --- Annotations for Margin ---
for bar, val in zip(bars2, product_summary['Margin_Display']):
height = bar.get_height()
ax2.text(bar.get_x() + bar.get_width()/2, height + 0.02*height,
f'{val:.2f}%', ha='center', va='bottom', fontsize=9)
# --- Formatting ---
ax1.set_title("Product-wise Sales vs Operating Margin", fontsize=14)
ax1.set_xticks(x)
ax1.set_xticklabels(product_summary['Product'], rotation=45, ha='right')
ax1.grid(False) # removes grid from left axis
ax2.grid(False) # removes grid from right axis
fig.tight_layout()
plt.show()
print()
# 4. Risky Volume Play: Retailers with High Sales but Low Margins
retailer_summary = df.groupby('Retailer').agg({
'Total Sales($)': 'sum',
'Operating Margin': 'mean'
}).reset_index()
sales_median = retailer_summary['Total Sales($)'].median()
margin_median = retailer_summary['Operating Margin'].median()
risky_retailers = retailer_summary[
(retailer_summary['Total Sales($)'] > sales_median) &
(retailer_summary['Operating Margin'] < margin_median)
]
print("Risky Volume Play Retailers (High Sales, Low Margin):")
print(risky_retailers)
# --- 1. Retailer Summary ---
retailer_summary = df.groupby('Retailer').agg({
'Total Sales($)': 'sum',
'Operating Margin': 'mean'
}).reset_index()
retailer_summary['Margin_Display'] = retailer_summary['Operating Margin'].round(2)
x = np.arange(len(retailer_summary['Retailer']))
width = 0.4
fig, ax1 = plt.subplots(figsize=(12,6))
# --- Left axis: Total Sales ---
bars1 = ax1.bar(x - width/2, retailer_summary['Total Sales($)'], width,
label='Total Sales ($)', color='lightgreen')
ax1.set_ylabel("Total Sales ($)", color="green")
ax1.tick_params(axis="y", labelcolor="green")
# --- Right axis: Operating Margin ---
ax2 = ax1.twinx()
bars2 = ax2.bar(x + width/2, retailer_summary['Operating Margin'], width,
label='Operating Margin (%)', color='coral')
ax2.set_ylabel("Operating Margin (%)", color="red")
ax2.tick_params(axis="y", labelcolor="red")
# --- Annotations for Sales ---
for bar in bars1:
height = bar.get_height()
ax1.text(bar.get_x() + bar.get_width()/2, height + 0.02*height,
f'{int(height):,}', ha='center', va='bottom', fontsize=9)
# --- Annotations for Margin ---
for bar, val in zip(bars2, retailer_summary['Margin_Display']):
height = bar.get_height()
ax2.text(bar.get_x() + bar.get_width()/2, height + 0.02*height,
f'{val:.2f}%', ha='center', va='bottom', fontsize=9)
# --- Formatting ---
ax1.set_title("Retailer-wise Sales vs Operating Margin", fontsize=14)
ax1.set_xticks(x)
ax1.set_xticklabels(retailer_summary['Retailer'], rotation=45, ha='right')
ax1.grid(False)
ax2.grid(False)
fig.tight_layout()
plt.show()
# 5. Which Sales Method is making more profit
method_profit = df.groupby('Sales Method')['Operating Profit($)'].sum().sort_values(ascending=False)
plt.figure(figsize=(8,5))
ax = method_profit.plot(kind='bar', color="orange")
plt.title("Profit Contribution by Sales Method")
plt.ylabel("Operating Profit ($)")
plt.xticks(rotation=45, ha='right')
# Annotate values
for i, v in enumerate(method_profit):
ax.text(i, v, f'{int(v):,}', ha='center', va='bottom', fontsize=9)
ax.grid(False) # removes grid from left axis
plt.show()
# 6. Which State is making more profit?
state_profit = df.groupby('State')['Operating Profit($)'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(12,6))
ax = state_profit.plot(kind='bar', color="teal")
plt.title("Top 10 States by Profit Contribution")
plt.ylabel("Operating Profit ($)")
plt.xticks(rotation=45, ha='right')
# Annotate values
for i, v in enumerate(state_profit):
ax.text(i, v, f'{int(v):,}', ha='center', va='bottom', fontsize=9)
ax.grid(False) # removes grid from left axis
plt.show()
Hidden Gem Products (Low Sales, High Margin): Product Total Sales($) Operating Margin 4 Women's Athletic Footwear 106631896 0.424359
Risky Volume Play Retailers (High Sales, Low Margin): Retailer Total Sales($) Operating Margin 5 West Gear 242964333 0.417856
Correlation Analysis¶
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
# Select only numeric columns for correlation
numeric_df = df.select_dtypes(include=['float64', 'int64'])
# Compute correlation matrix
corr = numeric_df.corr()
# Plot heatmap
plt.figure(figsize=(10,7))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Analysis Heatmap", fontsize=14)
plt.show()